Classifying Public Institutions By Effectiveness
- Looking at Multicollinearity
- Cross validation
- Regression
- Classification
- Low High
- PCA and Random Forest on all Features
- Random Forest with Trimmed Variables
- Decision Tree without GridSearchCV
- Dummy Classifier
- Low Medium High
- PCA and Random Forest on all Features
- Random Forest
- Decision Tree
- Decision Tree without GridSearchCV
- Dummy Classifier
- Quartiles
- PCA and Random Forest on all Features
- Random Forest with Trimmed Set
- Decision Tree on Trimmed Set
- Decision Tree without GridSearchCV
- Dummy Classifier
- Without Selectors
#!pip install graphviz
#!pip install matplotlib --upgrade
#!pip install scikit-learn --upgrade
#!pip install dtreeviz
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, ConfusionMatrixDisplay, accuracy_score, auc, roc_curve, explained_variance_score, r2_score
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
import graphviz
from sklearn.decomposition import PCA, FactorAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, chi2, f_classif, mutual_info_classif
from sklearn.preprocessing import normalize
from sklearn.preprocessing import OneHotEncoder
from sklearn.dummy import DummyClassifier
import random
%matplotlib inline
# set max columns to none
pd.set_option("display.max_columns", None)
# set colwidth higher
pd.set_option('display.max_colwidth', 100)
##set seed for reproducibility
random.seed(21)
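## note: random.seed only seeds Python's built-in RNG; scikit-learn draws from
## NumPy's generator, so seeding NumPy as well (alongside the random_state
## arguments used below) makes reruns more reproducible
np.random.seed(21)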
ipeds = pd.read_csv("/dsa/groups/capstonesp2022/on-campus/group_2/IPEDSApril26_updated.csv")
ipeds = ipeds.drop(columns = ['Unnamed: 0'])
ipeds.head()
ipeds = pd.get_dummies(ipeds, columns=["STATE"], prefix=["US"])
dropped = ipeds.dropna()
print(dropped.shape)
dropped.head()
## split into train (75%) and a combined test/validation pool (25%)
train, testVal = train_test_split(dropped, test_size=0.25, random_state = 21)
## split the pool evenly into test and validation
test, validation = train_test_split(testVal, test_size = .5, random_state = 21)
print(train.shape)
print(test.shape)
print(validation.shape)
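## for the classification targets used later, train_test_split can also preserve
## class proportions via its stratify argument (sketch, two-class label):
## train, testVal = train_test_split(dropped, test_size=0.25, random_state=21,
##                                   stratify=dropped.Grad_Rates_Two_Classes)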
X_train = train.loc[:, ~dropped.columns.isin(['unitid', 'Grad_Rates_Two_Classes', 'exp_acad_inst_student_total_per','Grad_Rates_Three_Classes', 'Grad_Rates_Quartiles', 'completion_rate_150pct', 'year', 'cc_basic_2010'])]
X_test = test.loc[:, ~dropped.columns.isin(['unitid', 'Grad_Rates_Two_Classes', 'exp_acad_inst_student_total_per','Grad_Rates_Three_Classes', 'Grad_Rates_Quartiles', 'completion_rate_150pct', 'year', 'cc_basic_2010'])]
y_train = train.completion_rate_150pct
y_test = test.completion_rate_150pct
X_train.head(1)
print(np.all(np.isfinite(X_train)))
print(np.any(np.isnan(X_train)))
corrMatrix=X_train.corr()
corrMatrix.style.background_gradient(cmap='coolwarm')
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
plt.figure(figsize=(15,15))
matrix = corrMatrix.round(2)
sns.heatmap(matrix, annot=False, vmax=1, vmin=-1, center=0, cmap='vlag')
plt.savefig('CorrelationMatrix.png',dpi=300)
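## the heatmap shows pairwise correlation only; variance inflation factors give
## a per-feature multicollinearity score (sketch, assuming statsmodels is
## available in this environment)
from statsmodels.stats.outliers_influence import variance_inflation_factor
vif = pd.Series(
    [variance_inflation_factor(X_train.values, i) for i in range(X_train.shape[1])],
    index=X_train.columns)
vif.sort_values(ascending=False).head(10)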
scores = cross_val_score(LinearRegression(), X_train, y_train, cv = 10)
scores
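## summarize the ten fold scores (cross_val_score uses R^2 for regressors)
print(f"CV R^2: {scores.mean():.3f} +/- {scores.std():.3f}")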
X_train = train.loc[:, ~dropped.columns.isin(['unitid', 'exp_acad_inst_student_total_per', 'Grad_Rates_Two_Classes', 'Grad_Rates_Three_Classes', 'Grad_Rates_Quartiles', 'completion_rate_150pct', 'year', 'cc_basic_2010'])]
X_test = test.loc[:, ~dropped.columns.isin(['unitid', 'exp_acad_inst_student_total_per', 'Grad_Rates_Two_Classes', 'Grad_Rates_Three_Classes', 'Grad_Rates_Quartiles', 'completion_rate_150pct', 'year', 'cc_basic_2010'])]
y_train = train.Grad_Rates_Two_Classes
y_test = test.Grad_Rates_Two_Classes
X_train.head(1)
## anomaly detection and outlier removal: EllipticEnvelope fits a robust
## Gaussian to the features and flags the most extreme ~3% of rows as outliers
from sklearn.covariance import EllipticEnvelope
envelope1 = EllipticEnvelope(support_fraction=1, contamination=0.03).fit(X_train)
outliers1 = envelope1.predict(X_train)==-1
X_clean1 = X_train[~outliers1]
y_clean1 = y_train[~outliers1]
print(f"Num of outliers = {np.sum(outliers1)}")
rf = RandomForestClassifier(n_estimators=100)
rf.fit(X_clean1, y_clean1)
##grab importance values
importances = rf.feature_importances_
##sort the indices
sorted_indices = np.argsort(importances)[::-1]
import matplotlib.pyplot as plt
#ax.set_title('Feature Importance All Variables')
plt.rcdefaults()
plt.figure(figsize=(15,15))
plt.barh(range(X_train.shape[1]), importances[sorted_indices], align='center', color = "darkgreen")
plt.yticks(range(X_train.shape[1]), X_train.columns[sorted_indices], fontsize=16)
plt.xticks(fontsize=20)
plt.savefig('FeatureImportanceALLVariables.png',dpi=300)
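## a quick tabular companion to the plot: top ten features by impurity importance
pd.Series(importances, index=X_train.columns).sort_values(ascending=False).head(10)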
## positional slice keeping the first 21 columns (not the same as the top-21
## features by importance in the plot above)
X_clean2 = X_clean1.iloc[:, :21]
rf = RandomForestClassifier(n_estimators=100)
rf.fit(X_clean2, y_clean1)
##grab importance values
importances = rf.feature_importances_
##sort the indices
sorted_indices = np.argsort(importances)[::-1]
import matplotlib.pyplot as plt
#ax.set_title('Feature Importance All Variables')
plt.rcdefaults()
plt.figure(figsize=(10,10))
plt.barh(range(X_clean2.shape[1]), importances[sorted_indices], align='center', color = "darkgreen")
plt.yticks(range(X_clean2.shape[1]), X_clean2.columns[sorted_indices], fontsize=16)
plt.xticks(fontsize=20)
plt.savefig('FeatureImportance21Variables.png',dpi=300)
X_train = train[['est_fte', 'exp_instruc_total_per', 'exp_acad_supp_total_per', 'exp_student_serv_total_per', 'exp_inst_supp_total_per', 'act_composite_75_pctl', 'act_composite_25_pctl', 'acceptance_rate']]
X_test = test[['est_fte', 'exp_instruc_total_per', 'exp_acad_supp_total_per', 'exp_student_serv_total_per', 'exp_inst_supp_total_per', 'act_composite_75_pctl', 'act_composite_25_pctl', 'acceptance_rate']]
y_train = train.completion_rate_150pct
y_test = test.completion_rate_150pct
X_train.head(1)
## export a correlation-matrix figure for the trimmed feature set
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
corrMatrix=X_train.corr()
plt.figure(figsize=(15,15))
plt.xticks(fontsize=20,)
plt.yticks(fontsize=20)
matrix = corrMatrix.round(2)
sns.heatmap(matrix, annot=False, vmax=1, vmin=-1, center=0, cmap='vlag')
plt.xticks(rotation = 45)
plt.savefig('CorrelationMatrixTrimmed.png',dpi=300)
X_trainFull = train.loc[:, ~dropped.columns.isin(['unitid', 'Grad_Rates_Two_Classes', 'exp_acad_inst_student_total_per','Grad_Rates_Three_Classes', 'Grad_Rates_Quartiles', 'completion_rate_150pct', 'year', 'cc_basic_2010'])]
X_testFull = test.loc[:, ~dropped.columns.isin(['unitid', 'Grad_Rates_Two_Classes', 'exp_acad_inst_student_total_per','Grad_Rates_Three_Classes', 'Grad_Rates_Quartiles', 'completion_rate_150pct', 'year', 'cc_basic_2010'])]
X_trainTrimmed = train[['est_fte', 'exp_instruc_total_per', 'exp_acad_supp_total_per', 'exp_student_serv_total_per', 'exp_inst_supp_total_per', 'act_composite_75_pctl', 'act_composite_25_pctl', 'acceptance_rate']]
X_testTrimmed = test[['est_fte', 'exp_instruc_total_per', 'exp_acad_supp_total_per', 'exp_student_serv_total_per', 'exp_inst_supp_total_per', 'act_composite_75_pctl', 'act_composite_25_pctl', 'acceptance_rate']]
y_trainRate = train.completion_rate_150pct
y_testRate = test.completion_rate_150pct
y_train2 = train.Grad_Rates_Two_Classes
y_test2 = test.Grad_Rates_Two_Classes
y_train3 = train.Grad_Rates_Three_Classes
y_test3 = test.Grad_Rates_Three_Classes
y_train4 = train.Grad_Rates_Quartiles
y_test4 = test.Grad_Rates_Quartiles
##regression sets
from sklearn.covariance import EllipticEnvelope
envelope1 = EllipticEnvelope(support_fraction=1, contamination=0.03).fit(X_trainFull)
outliers1 = envelope1.predict(X_trainFull)==-1
X_cleanFullRate = X_trainFull[~outliers1]
y_cleanFullRate = y_trainRate[~outliers1]
print(f"Num of outliers = {np.sum(outliers1)}")
from sklearn.covariance import EllipticEnvelope
envelope1b = EllipticEnvelope(support_fraction=1, contamination=0.03).fit(X_trainTrimmed)
outliers1b = envelope1b.predict(X_trainTrimmed)==-1
X_cleanTrimmedRate = X_trainTrimmed[~outliers1b]
y_cleanTrimmedRate = y_trainRate[~outliers1b]
print(f"Num of outliers = {np.sum(outliers1b)}")
print(X_cleanFullRate.shape)
print(X_cleanTrimmedRate.shape)
X_cleanTrimmedRate.head()
## anomaly detection and outlier removal
from sklearn.covariance import EllipticEnvelope
envelope2 = EllipticEnvelope(support_fraction=1, contamination=0.03).fit(X_trainFull)
outliers2 = envelope2.predict(X_trainFull)==-1
X_clean2Full = X_trainFull[~outliers2]
y_clean2Full = y_train2[~outliers2]
print(f"Num of outliers = {np.sum(outliers2)}")
envelope2b = EllipticEnvelope(support_fraction=1, contamination=0.03).fit(X_trainTrimmed)
outliers2b = envelope2b.predict(X_trainTrimmed)==-1
X_clean2Trimmed = X_trainTrimmed[~outliers2b]
y_clean2Trimmed = y_train2[~outliers2b]
print(f"Num of outliers = {np.sum(outliers2b)}")
## anomaly detection and outlier removal
envelope3 = EllipticEnvelope(support_fraction=1, contamination=0.03).fit(X_trainFull)
outliers3 = envelope3.predict(X_trainFull)==-1
X_clean3Full = X_trainFull[~outliers3]
y_clean3Full = y_train3[~outliers3]
print(f"Num of outliers = {np.sum(outliers3)}")
envelope3b = EllipticEnvelope(support_fraction=1, contamination=0.03).fit(X_trainTrimmed)
outliers3b = envelope3b.predict(X_trainTrimmed)==-1
X_clean3Trimmed = X_trainTrimmed[~outliers3b]
y_clean3Trimmed = y_train3[~outliers3b]
print(f"Num of outliers = {np.sum(outliers3b)}")
## anomaly detection and outlier removal
envelope4 = EllipticEnvelope(support_fraction=1, contamination=0.03).fit(X_trainFull)
outliers4 = envelope4.predict(X_trainFull)==-1
X_clean4Full = X_trainFull[~outliers4]
y_clean4Full = y_train4[~outliers4]
print(f"Num of outliers = {np.sum(outliers4)}")
envelope4b = EllipticEnvelope(support_fraction=1, contamination=0.03).fit(X_trainTrimmed)
outliers4b = envelope4b.predict(X_trainTrimmed)==-1
X_clean4Trimmed = X_trainTrimmed[~outliers4b]
y_clean4Trimmed = y_train4[~outliers4b]
print(f"Num of outliers = {np.sum(outliers4b)}")
pipe0 = Pipeline([
('LinearRegression', LinearRegression())
])
param_grid0 = {
'LinearRegression__n_jobs': [1,3,5]
}
model_grid0 = GridSearchCV(pipe0, param_grid0, cv = 5, n_jobs = 5)
model_grid0.fit(X_cleanTrimmedRate, y_cleanTrimmedRate)
model_grid0.best_params_ ##Best hyperparameters
print(model_grid0.best_score_)
y_predicted = model_grid0.predict(X_testTrimmed)
##explained variance
explained_variance_score(y_testRate, y_predicted)
r2_score(y_testRate, y_predicted)
plt.figure(figsize=(15,15))
plt.scatter(y_testRate, y_predicted, c='crimson')
#plt.yscale('log')
#plt.xscale('log')
p1 = max(max(y_predicted), max(y_testRate))
p2 = min(min(y_predicted), min(y_testRate))
plt.plot([p1, p2], [p1, p2], 'b-')
plt.xlabel('True Values', fontsize=20)
plt.ylabel('Predictions', fontsize=20)
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)
plt.axis('equal')
plt.savefig('RegressionActualVsPredicted.png',dpi=300)
pipe = Pipeline([
('PCA', PCA()),
('LinearRegression', LinearRegression())
])
param_grid = {
'PCA__n_components': [1, 10, 15, 20, 25, 30, 35, 40, 50, 55, 60, 65, 70],
'LinearRegression__n_jobs': [1,3,5]
}
model_grid = GridSearchCV(pipe, param_grid, cv = 5, n_jobs = 5)
model_grid.fit(X_cleanFullRate, y_cleanFullRate)
model_grid.best_params_ ##Best hyperparameters
print(model_grid.best_score_)
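## inspect the winning PCA step: components kept and total variance retained
best_pca = model_grid.best_estimator_.named_steps['PCA']
print(best_pca.n_components_, best_pca.explained_variance_ratio_.sum())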
y_predicted = model_grid.predict(X_testFull)
##explained variance
explained_variance_score(y_testRate, y_predicted)
r2_score(y_testRate, y_predicted)
plt.figure(figsize=(15,15))
plt.scatter(y_testRate, y_predicted, c='crimson')
#plt.yscale('log')
#plt.xscale('log')
p1 = max(max(y_predicted), max(y_testRate))
p2 = min(min(y_predicted), min(y_testRate))
plt.plot([p1, p2], [p1, p2], 'b-')
plt.xlabel('True Values', fontsize=20)
plt.ylabel('Predictions', fontsize=20)
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)
plt.axis('equal')
plt.savefig('RegressionActualVsPredictedPCA.png',dpi=300)
pipe2 = Pipeline([
('PCA', PCA()),
('RandomForestClassifier', RandomForestClassifier())
])
param_grid2 = {
'PCA__n_components': [1, 10, 30, 50, 70],
'RandomForestClassifier__n_estimators': [50, 150],
'RandomForestClassifier__criterion': ['gini', 'entropy'],
'RandomForestClassifier__max_depth': [8]
}
model_grid2 = GridSearchCV(pipe2, param_grid2, cv = 5, n_jobs = 2)
model_grid2.fit(X_clean2Full, y_clean2Full)
model_grid2.best_params_ ##Best hyperparameters
print(model_grid2.best_score_)
accuracy_score(y_test2, model_grid2.predict(X_testFull))
##get predictions
y_predicted = model_grid2.predict(X_testFull)
##AUC Score
fpr, tpr, thresholds = roc_curve(y_test2, y_predicted)
auc(fpr, tpr)
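## the score above uses hard 0/1 predictions; scoring the predicted
## probabilities instead gives the usual threshold-free ROC AUC
roc_auc_score(y_test2, model_grid2.predict_proba(X_testFull)[:, 1])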
y_predicted = model_grid2.predict(X_testFull)
y_predicted[1:20]
print(confusion_matrix(y_test2, y_predicted))
print(classification_report(y_test2, y_predicted))
plt.figure(figsize=(20,20))
cmd = ConfusionMatrixDisplay.from_estimator(model_grid2, X_testFull, y_test2, normalize = 'all', display_labels=['Low','High'])
cmd.figure_.savefig('RandomPCA_LH_Fig1F.png',dpi=300)
pipe3 = Pipeline([
('RandomForestClassifier', RandomForestClassifier())
])
param_grid3 = {
'RandomForestClassifier__n_estimators': [50, 150],
'RandomForestClassifier__criterion': ['gini', 'entropy'],
'RandomForestClassifier__max_depth': [8]
}
model_grid3 = GridSearchCV(pipe3, param_grid3, cv = 5, n_jobs = 5)
model_grid3.fit(X_clean2Trimmed, y_clean2Trimmed)
model_grid3.best_params_ ##Best hyperparameters
y_predicted = model_grid3.predict(X_testTrimmed)
y_predicted[1:20]
print(confusion_matrix(y_test2, y_predicted))
print(classification_report(y_test2, y_predicted))
print(model_grid3.best_score_)
accuracy_score(y_test2, model_grid3.predict(X_testTrimmed))
##get predictions
y_predicted = model_grid3.predict(X_testTrimmed)
##AUC Score
fpr, tpr, thresholds = roc_curve(y_test2, y_predicted)
auc(fpr, tpr)
plt.figure(figsize=(20,20))
cmd = ConfusionMatrixDisplay.from_estimator(model_grid3, X_testTrimmed, y_test2, normalize = 'all', display_labels=['Low','High'])
cmd.figure_.savefig('Random_LH_Fig2F.png',dpi=300)
pipeDT2 = Pipeline([
('DecisionTreeClassifier', DecisionTreeClassifier())
])
param_gridDT2 = {
'DecisionTreeClassifier__min_samples_leaf': [10, 50, 150, 300],
'DecisionTreeClassifier__criterion': ['gini', 'entropy'],
'DecisionTreeClassifier__max_depth': [5, 10, 15, 20]
}
model_gridDT2 = GridSearchCV(pipeDT2, param_gridDT2, cv = 5, n_jobs = 5)
model_gridDT2.fit(X_clean2Trimmed, y_clean2Trimmed)
model_gridDT2.best_params_ ##Best hyperparameters
y_predicted = model_gridDT2.predict(X_testTrimmed)
y_predicted[1:20]
print(confusion_matrix(y_test2, y_predicted))
print(classification_report(y_test2, y_predicted))
print(model_gridDT2.best_score_)
accuracy_score(y_test2, model_gridDT2.predict(X_testTrimmed))
##get predictions
y_predicted = model_gridDT2.predict(X_testTrimmed)
##AUC Score
fpr, tpr, thresholds = roc_curve(y_test2, y_predicted)
auc(fpr, tpr)
plt.figure(figsize=(20,20))
cmd = ConfusionMatrixDisplay.from_estimator(model_gridDT2, X_testTrimmed, y_test2, normalize = 'all', display_labels=['Low','High'])
cmd.figure_.savefig('DecisionTree_LH_Fig2F.png',dpi=300)
clf = DecisionTreeClassifier(max_depth = 5, criterion = 'gini', min_samples_leaf = 150, random_state=21)
model = clf.fit(X_clean2Trimmed, y_clean2Trimmed)
feature_cols = list(X_clean2Trimmed.columns)
target_names = ['Low', 'High']
feature_cols
X = X_clean2Trimmed
y = y_clean2Trimmed
pred = clf.predict(X_testTrimmed)
acc = accuracy_score(y_test2, pred)
acc
fig = plt.figure(figsize=(35,20))
_ = tree.plot_tree(clf, fontsize=12,
feature_names=feature_cols,
class_names=target_names,
filled=True)
fig.savefig('Tree2ClassF.png',dpi=300)
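## dtreeviz (installed above but not yet used) can render a richer view of this
## same tree; a minimal sketch assuming the dtreeviz 1.x API and integer-coded
## class labels (0 = Low, 1 = High)
from dtreeviz.trees import dtreeviz
viz = dtreeviz(clf, X, y,
               feature_names=feature_cols,
               class_names=target_names,
               target_name='Grad_Rates_Two_Classes')
viz.save('Tree2ClassF.svg')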
pipeDum2 = Pipeline([
('DummyClassifier', DummyClassifier())
])
param_gridDum2 = {
'DummyClassifier__strategy': ['most_frequent', 'prior', 'stratified', 'uniform'],
}
model_gridDum2 = GridSearchCV(pipeDum2, param_gridDum2, cv = 5, n_jobs = 5)
model_gridDum2.fit(X_clean2Trimmed, y_clean2Trimmed)
model_gridDum2.best_params_ ##Best hyperparameters
y_predicted = model_gridDum2.predict(X_testTrimmed)
y_predicted[1:20]
print(confusion_matrix(y_test2, y_predicted))
print(classification_report(y_test2, y_predicted))
print(model_gridDum2.best_score_)
accuracy_score(y_test2, model_gridDum2.predict(X_testTrimmed))
##get predictions
y_predicted = model_gridDum2.predict(X_testTrimmed)
##AUC Score
fpr, tpr, thresholds = roc_curve(y_test2, y_predicted)
auc(fpr, tpr)
plt.figure(figsize=(20,20))
cmd = ConfusionMatrixDisplay.from_estimator(model_gridDum2, X_testTrimmed, y_test2, normalize = 'all', display_labels=['Low','High'])
cmd.figure_.savefig('Dummy_LH_Fig2F.png',dpi=300)
pipe4 = Pipeline([
('PCA', PCA()),
('RandomForestClassifier', RandomForestClassifier())
])
param_grid4 = {
'PCA__n_components': [1, 10, 30, 50, 70],
'RandomForestClassifier__n_estimators': [50, 150],
'RandomForestClassifier__criterion': ['gini', 'entropy'],
'RandomForestClassifier__max_depth': [8]
}
model_grid4 = GridSearchCV(pipe4, param_grid4, cv = 5, n_jobs = 2)
model_grid4.fit(X_clean3Full, y_clean3Full)
model_grid4.best_params_ ##Best hyperparameters
print(model_grid4.best_score_)
accuracy_score(y_test3, model_grid4.predict(X_testFull))
#Calculate the y_score
y_score = model_grid4.predict_proba(X_testFull)
##get auc score
roc_auc_score(y_test3, y_score, multi_class='ovr')
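## the 'ovr' figure above is a macro average; per-class one-vs-rest AUCs show
## which classes separate best (classes_ gives the column order of predict_proba)
for k, cls in enumerate(model_grid4.classes_):
    print(cls, roc_auc_score((y_test3 == cls).astype(int), y_score[:, k]))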
plt.figure(figsize=(20,20))
cmd = ConfusionMatrixDisplay.from_estimator(model_grid4, X_testFull, y_test3, normalize = 'all', display_labels=['Low','Medium', 'High'])
cmd.figure_.savefig('PCARandom_LMH_Fig3F.png',dpi=300)
pipe5 = Pipeline([
('RandomForestClassifier', RandomForestClassifier())
])
param_grid5 = {
'RandomForestClassifier__n_estimators': [50, 150],
'RandomForestClassifier__criterion': ['gini', 'entropy'],
'RandomForestClassifier__max_depth': [8]
}
model_grid5 = GridSearchCV(pipe5, param_grid5, cv = 5, n_jobs = 5)
model_grid5.fit(X_clean3Trimmed, y_clean3Trimmed)
model_grid5.best_params_ ##Best hyperparameters
y_predicted = model_grid5.predict(X_testTrimmed)
y_predicted[1:20]
print(confusion_matrix(y_test3, y_predicted))
print(classification_report(y_test3, y_predicted))
print(model_grid5.best_score_)
accuracy_score(y_test3, model_grid5.predict(X_testTrimmed))
#Calculate the y_score
y_score = model_grid5.predict_proba(X_testTrimmed)
##get auc score
roc_auc_score(y_test3, y_score, multi_class='ovr')
plt.figure(figsize=(20,20))
cmd = ConfusionMatrixDisplay.from_estimator(model_grid5, X_testTrimmed, y_test3, normalize = 'all', display_labels=['Low', 'Medium','High'])
cmd.figure_.savefig('Random_LMH_Fig4F.png',dpi=300)
pipeDT3 = Pipeline([
('DecisionTreeClassifier', DecisionTreeClassifier())
])
param_gridDT3 = {
'DecisionTreeClassifier__min_samples_leaf': [10, 50, 150, 300],
'DecisionTreeClassifier__criterion': ['gini', 'entropy'],
'DecisionTreeClassifier__max_depth': [5, 10, 15, 20]
}
model_gridDT3 = GridSearchCV(pipeDT3, param_gridDT3, cv = 5, n_jobs = 5)
model_gridDT3.fit(X_clean3Trimmed, y_clean3Trimmed)
print(model_gridDT3.best_params_) ##Best hyperparameters
##model evaluation
##get predictions
y_predicted = model_gridDT3.predict(X_testTrimmed)
print(confusion_matrix(y_test3, y_predicted))
print(classification_report(y_test3, y_predicted))
##print model score
print("Model Score", model_gridDT3.best_score_)
##Accuracy Score
print("Accuracy Score",accuracy_score(y_test3, model_gridDT3.predict(X_testTrimmed)))
##get auc score
#Calculate the y_score
y_score = model_gridDT3.predict_proba(X_testTrimmed)
print("AUC Score", roc_auc_score(y_test3, y_score, multi_class='ovr'))
plt.figure(figsize=(20,20))
cmd = ConfusionMatrixDisplay.from_estimator(model_gridDT3, X_testTrimmed, y_test3, normalize = 'all', display_labels=['Low','Medium','High'])
cmd.figure_.savefig('DecisionTree_LMH_Fig4F.png',dpi=300)
clf = DecisionTreeClassifier(max_depth = 10, criterion = 'gini', min_samples_leaf = 50, random_state=21)
model = clf.fit(X_clean3Trimmed, y_clean3Trimmed)
feature_cols = list(X_clean3Trimmed.columns)
target_names = ['Low', 'Medium', 'High']
feature_cols
X = X_clean3Trimmed
y = y_clean3Trimmed
fig = plt.figure(figsize=(45,20))
_ = tree.plot_tree(clf, fontsize=12,
feature_names=feature_cols,
class_names=target_names,
filled=True)
fig.savefig('Tree3ClassF.png',dpi=300)
pred = clf.predict(X_testTrimmed)
acc = accuracy_score(y_test3, pred)
acc
pipeDum3 = Pipeline([
('DummyClassifier', DummyClassifier())
])
param_gridDum3 = {
'DummyClassifier__strategy': ['most_frequent', 'prior', 'stratified', 'uniform'],
}
model_gridDum3 = GridSearchCV(pipeDum3, param_gridDum3, cv = 5, n_jobs = 5)
model_gridDum3.fit(X_clean3Trimmed, y_clean3Trimmed)
model_gridDum3.best_params_ ##Best hyperparameters
y_predicted = model_gridDum3.predict(X_testTrimmed)
y_predicted[1:20]
print(confusion_matrix(y_test3, y_predicted))
print(classification_report(y_test3, y_predicted))
##print model score
print("Model Score:", model_gridDum3.best_score_)
##Accuracy Score
print("Accuracy Score:",accuracy_score(y_test3, model_gridDum3.predict(X_testTrimmed)))
#Calculate the y_score
y_score = model_gridDum3.predict_proba(X_testTrimmed)
print("AUC Score", roc_auc_score(y_test3, y_score, multi_class='ovr'))
plt.figure(figsize=(20,20))
cmd = ConfusionMatrixDisplay.from_estimator(model_gridDum3, X_testTrimmed, y_test3, normalize = 'all', display_labels=['Low','Medium','High'])
cmd.figure_.savefig('Dummy_LMH_Fig4F.png',dpi=300)
pipe6 = Pipeline([
('PCA', PCA()),
('RandomForestClassifier', RandomForestClassifier())
])
param_grid6 = {
'PCA__n_components': [1, 10, 30, 50, 70],
'RandomForestClassifier__n_estimators': [50, 150],
'RandomForestClassifier__criterion': ['gini', 'entropy'],
'RandomForestClassifier__max_depth': [8]
}
model_grid6 = GridSearchCV(pipe6, param_grid6, cv = 5, n_jobs = 2)
model_grid6.fit(X_clean4Full, y_clean4Full)
model_grid6.best_params_ ##Best hyperparameters
print(model_grid6.best_score_)
accuracy_score(y_test4, model_grid6.predict(X_testFull))
#Calculate the y_score
y_score = model_grid6.predict_proba(X_testFull)
##get auc score
roc_auc_score(y_test4, y_score, multi_class='ovr')
plt.figure(figsize=(20,20))
cmd = ConfusionMatrixDisplay.from_estimator(model_grid6, X_testFull, y_test4, normalize = 'all', display_labels=['Q1','Q2', 'Q3','Q4'])
cmd.figure_.savefig('PCARandom_Quart_Fig5F.png',dpi=300)
pipe7 = Pipeline([
('RandomForestClassifier', RandomForestClassifier())
])
param_grid7 = {
'RandomForestClassifier__n_estimators': [50, 150],
'RandomForestClassifier__criterion': ['gini', 'entropy'],
'RandomForestClassifier__max_depth': [8]
}
model_grid7 = GridSearchCV(pipe7, param_grid7, cv = 5, n_jobs = 5)
model_grid7.fit(X_clean4Trimmed, y_clean4Trimmed)
model_grid7.best_params_ ##Best hyperparameters
y_predicted = model_grid7.predict(X_testTrimmed)
y_predicted[1:20]
print(confusion_matrix(y_test4, y_predicted))
print(classification_report(y_test4, y_predicted))
print(model_grid7.best_score_)
accuracy_score(y_test4, model_grid7.predict(X_testTrimmed))
#Calculate the y_score
y_score = model_grid7.predict_proba(X_testTrimmed)
##get auc score
roc_auc_score(y_test4, y_score, multi_class='ovr')
plt.figure(figsize=(20,20))
cmd = ConfusionMatrixDisplay.from_estimator(model_grid7, X_testTrimmed, y_test4, normalize = 'all', display_labels=['Q1', 'Q2','Q3', 'Q4'])
cmd.figure_.savefig('Random_Quart_Fig6F.png',dpi=300)
pipeDT4 = Pipeline([
('DecisionTreeClassifier', DecisionTreeClassifier())
])
param_gridDT4 = {
'DecisionTreeClassifier__min_samples_leaf': [10, 50, 150, 300],
'DecisionTreeClassifier__criterion': ['gini', 'entropy'],
'DecisionTreeClassifier__max_depth': [5, 10, 15, 20]
}
model_gridDT4 = GridSearchCV(pipeDT4, param_gridDT4, cv = 5, n_jobs = 5)
model_gridDT4.fit(X_clean4Trimmed, y_clean4Trimmed)
print(model_gridDT4.best_params_) ##Best hyperparameters
##model evaluation
##get predictions
y_predicted = model_gridDT4.predict(X_testTrimmed)
print(confusion_matrix(y_test4, y_predicted))
print(classification_report(y_test4, y_predicted))
##print model score
print("Model Score", model_gridDT4.best_score_)
##Accuracy Score
print("Accuracy Score",accuracy_score(y_test4, model_gridDT4.predict(X_testTrimmed)))
##get auc score
#Calculate the y_score
y_score = model_gridDT4.predict_proba(X_testTrimmed)
print("AUC Score", roc_auc_score(y_test4, y_score, multi_class='ovr'))
plt.figure(figsize=(20,20))
cmd = ConfusionMatrixDisplay.from_estimator(model_gridDT4, X_testTrimmed, y_test4, normalize = 'all', display_labels=['Q1','Q2','Q3', 'Q4'])
cmd.figure_.savefig('DecisionTree_Quart_Fig5F.png',dpi=300)
clf = DecisionTreeClassifier(max_depth = 10, criterion = 'gini', min_samples_leaf = 50, random_state=21)
model = clf.fit(X_clean4Trimmed, y_clean4Trimmed)
pred = clf.predict(X_testTrimmed)
acc = accuracy_score(y_test4, pred)
acc
feature_cols = list(X_clean4Trimmed.columns)
target_names = ['Q1', 'Q2', 'Q3', 'Q4']
feature_cols
X = X_clean4Trimmed
y = y_clean4Trimmed
fig = plt.figure(figsize=(45,20))
_ = tree.plot_tree(clf, fontsize=12,
feature_names=feature_cols,
class_names=target_names,
filled=True)
fig.savefig('Tree4Class.png',dpi=300)
pipeDum4 = Pipeline([
('DummyClassifier', DummyClassifier())
])
param_gridDum4 = {
'DummyClassifier__strategy': ['most_frequent', 'prior', 'stratified', 'uniform'],
}
model_gridDum4 = GridSearchCV(pipeDum4, param_gridDum4, cv = 5, n_jobs = 5)
model_gridDum4.fit(X_clean4Trimmed, y_clean4Trimmed)
model_gridDum4.best_params_ ##Best hyperparameters
y_predicted = model_gridDum4.predict(X_testTrimmed)
y_predicted[1:20]
print(confusion_matrix(y_test4, y_predicted))
print(classification_report(y_test4, y_predicted))
##print model score
print("Model Score:", model_gridDum4.best_score_)
##Accuracy Score
print("Accuracy Score:",accuracy_score(y_test4, model_gridDum4.predict(X_testTrimmed)))
#Calculate the y_score
y_score = model_gridDum4.predict_proba(X_testTrimmed)
print("AUC Score", roc_auc_score(y_test4, y_score, multi_class='ovr'))
plt.figure(figsize=(20,20))
cmd = ConfusionMatrixDisplay.from_estimator(model_gridDum4, X_testTrimmed, y_test4, normalize = 'all', display_labels=['Q1','Q2','Q3', 'Q4'])
cmd.figure_.savefig('Dummy_Quart_Fig5F.png',dpi=300)
X_train = train.loc[:, ~dropped.columns.isin(['unitid', 'Grad_Rates_Two_Classes', 'exp_acad_inst_student_total_per','Grad_Rates_Three_Classes', 'Grad_Rates_Quartiles', 'completion_rate_150pct', 'year', 'cc_basic_2010', 'act_composite_75_pctl', 'act_composite_25_pctl', 'acceptance_rate'])]
X_test = test.loc[:, ~dropped.columns.isin(['unitid', 'Grad_Rates_Two_Classes', 'exp_acad_inst_student_total_per','Grad_Rates_Three_Classes', 'Grad_Rates_Quartiles', 'completion_rate_150pct', 'year', 'cc_basic_2010', 'act_composite_75_pctl', 'act_composite_25_pctl', 'acceptance_rate'])]
y_train = train.completion_rate_150pct
y_test = test.completion_rate_150pct
from sklearn.covariance import EllipticEnvelope
envelope1 = EllipticEnvelope(support_fraction=1, contamination=0.03).fit(X_train)
outliers1 = envelope1.predict(X_train)==-1
X_clean1 = X_train[~outliers1]
y_clean1 = y_train[~outliers1]
print(f"Num of outliers = {np.sum(outliers1)}")
pipe20 = Pipeline([
('PCA', PCA()),
('LinearRegression', LinearRegression())
])
# Configure the parameters for grid search
param_grid20 = {
'PCA__n_components': [1, 10, 30, 50, 55],
'LinearRegression__n_jobs': [1,3,5]
}
# Train the Pipeline with Grid Search
model_grid20 = GridSearchCV(pipe20, param_grid20, cv = 5, n_jobs = 5)
model_grid20.fit(X_clean1, y_clean1)
model_grid20.best_params_ ##Best hyperparameters
print(model_grid20.best_score_)
y_predicted = model_grid20.predict(X_test)
##explained variance
explained_variance_score(y_test, y_predicted)
##r2 score
r2_score(y_test, y_predicted)
plt.figure(figsize=(15,15))
plt.scatter(y_test, y_predicted, c='crimson')
#plt.yscale('log')
#plt.xscale('log')
p1 = max(max(y_predicted), max(y_test))
p2 = min(min(y_predicted), min(y_test))
plt.plot([p1, p2], [p1, p2], 'b-')
plt.xlabel('True Values', fontsize=20)
plt.ylabel('Predictions', fontsize=20)
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)
plt.axis('equal')
plt.savefig('RegressionActualVsPredicted_No_SelectorsFullSet.png',dpi=300)
X_train = train[['est_fte', 'exp_instruc_total_per', 'exp_acad_supp_total_per', 'exp_student_serv_total_per', 'exp_inst_supp_total_per']]
X_test = test[['est_fte', 'exp_instruc_total_per', 'exp_acad_supp_total_per', 'exp_student_serv_total_per', 'exp_inst_supp_total_per']]
y_train = train.completion_rate_150pct
y_test = test.completion_rate_150pct
X_train.head(1)
from sklearn.covariance import EllipticEnvelope
envelope1 = EllipticEnvelope(support_fraction=1, contamination=0.03).fit(X_train)
outliers1 = envelope1.predict(X_train)==-1
X_clean1 = X_train[~outliers1]
y_clean1 = y_train[~outliers1]
print(f"Num of outliers = {np.sum(outliers1)}")
pipe20 = Pipeline([
('LinearRegression', LinearRegression())
])
# Configure the parameters for grid search
param_grid20 = {
'LinearRegression__n_jobs': [1,3,5]
}
model_grid20 = GridSearchCV(pipe20, param_grid20, cv = 5, n_jobs = 5)
model_grid20.fit(X_clean1, y_clean1)
model_grid20.best_params_ ##Best hyperparameters
print(model_grid20.best_score_)
y_predicted = model_grid20.predict(X_test)
explained_variance_score(y_test, y_predicted)
##r2 score
r2_score(y_test, y_predicted)
plt.figure(figsize=(15,15))
plt.scatter(y_test, y_predicted, c='crimson')
#plt.yscale('log')
#plt.xscale('log')
p1 = max(max(y_predicted), max(y_test))
p2 = min(min(y_predicted), min(y_test))
plt.plot([p1, p2], [p1, p2], 'b-')
plt.xlabel('True Values', fontsize=20)
plt.ylabel('Predictions', fontsize=20)
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)
plt.axis('equal')
plt.savefig('RegressionActualVsPredicted_No_Selectors.png',dpi=300)
X_train = train[['est_fte', 'exp_instruc_total_per', 'exp_acad_supp_total_per', 'exp_student_serv_total_per', 'exp_inst_supp_total_per']]
X_test = test[['est_fte', 'exp_instruc_total_per', 'exp_acad_supp_total_per', 'exp_student_serv_total_per', 'exp_inst_supp_total_per']]
y_train = train.Grad_Rates_Two_Classes
y_test = test.Grad_Rates_Two_Classes
X_train.head(1)
from sklearn.covariance import EllipticEnvelope
envelope1 = EllipticEnvelope(support_fraction=1, contamination=0.03).fit(X_train)
outliers1 = envelope1.predict(X_train)==-1
X_clean1 = X_train[~outliers1]
y_clean1 = y_train[~outliers1]
print(f"Num of outliers = {np.sum(outliers1)}")
pipe8 = Pipeline([
('RandomForestClassifier', RandomForestClassifier())
])
param_grid8 = {
'RandomForestClassifier__n_estimators': [50, 100, 150],
'RandomForestClassifier__criterion': ['gini', 'entropy'],
'RandomForestClassifier__max_depth': [8]
}
model_grid8 = GridSearchCV(pipe8, param_grid8, cv = 5, n_jobs = 5)
model_grid8.fit(X_clean1, y_clean1)
model_grid8.best_params_ ##Best hyperparameters
y_predicted = model_grid8.predict(X_test)
y_predicted[1:20]
print(confusion_matrix(y_test, y_predicted))
print(classification_report(y_test, y_predicted))
print(model_grid8.best_score_)
accuracy_score(y_test, model_grid8.predict(X_test))
##get predictions
y_predicted = model_grid8.predict(X_test)
##AUC Score
fpr, tpr, thresholds = roc_curve(y_test, y_predicted)
auc(fpr, tpr)
plt.figure(figsize=(20,20))
cmd = ConfusionMatrixDisplay.from_estimator(model_grid8, X_test, y_test, normalize = 'all', display_labels=['Low', 'High'])
cmd.figure_.savefig('Random_LH_Fig7F.png',dpi=300)
clf = DecisionTreeClassifier(max_depth = 4, random_state=21)
model = clf.fit(X_clean1, y_clean1)
pred = clf.predict(X_test)
acc = accuracy_score(y_test, pred)
acc
feature_cols = list(X_clean1.columns)
target_names = ['Low', 'High']
feature_cols
X = X_clean1
y = y_clean1
fig = plt.figure(figsize=(35,20))
_ = tree.plot_tree(clf, fontsize=12,
feature_names=feature_cols,
class_names=target_names,
filled=True)
fig.savefig('Tree2ClassNoSelectivityF.png',dpi=300)
DumPipe = Pipeline([
('DummyClassifier', DummyClassifier())
])
Dum_param_grid = {
'DummyClassifier__strategy': ['most_frequent', 'prior', 'stratified', 'uniform'],
}
dum_model = GridSearchCV(DumPipe, Dum_param_grid, cv = 5, n_jobs = 5)
dum_model.fit(X_clean1, y_clean1)
print("Best Hyperparameters:",dum_model.best_params_) ##Best hyperparameters
y_predicted = dum_model.predict(X_test)
y_predicted[1:20]
print(confusion_matrix(y_test, y_predicted))
print(classification_report(y_test, y_predicted))
##print model score
print("Model Score:", dum_model.best_score_)
##Accuracy Score
print("Accuracy Score:",accuracy_score(y_test, dum_model.predict(X_test)))
##AUC
##get predictions
y_predicted = dum_model.predict(X_test)
##AUC Score
fpr, tpr, thresholds = roc_curve(y_test, y_predicted)
print("AUC Score:",auc(fpr, tpr))
plt.figure(figsize=(20,20))
cmd = ConfusionMatrixDisplay.from_estimator(dum_model, X_test, y_test, normalize = 'all', display_labels=['Low', 'High'])
cmd.figure_.savefig('Dummy_LH_Fig7F.png',dpi=300)
X_train = train[['est_fte', 'exp_instruc_total_per', 'exp_acad_supp_total_per', 'exp_student_serv_total_per', 'exp_inst_supp_total_per']]
X_test = test[['est_fte', 'exp_instruc_total_per', 'exp_acad_supp_total_per', 'exp_student_serv_total_per', 'exp_inst_supp_total_per']]
y_train = train.Grad_Rates_Three_Classes
y_test = test.Grad_Rates_Three_Classes
X_train.head(1)
from sklearn.covariance import EllipticEnvelope
envelope1 = EllipticEnvelope(support_fraction=1, contamination=0.03).fit(X_train)
outliers1 = envelope1.predict(X_train)==-1
X_clean1 = X_train[~outliers1]
y_clean1 = y_train[~outliers1]
print(f"Num of outliers = {np.sum(outliers1)}")
pipe9 = Pipeline([
('RandomForestClassifier', RandomForestClassifier())
])
param_grid9 = {
'RandomForestClassifier__n_estimators': [50, 100, 150],
'RandomForestClassifier__criterion': ['gini', 'entropy'],
'RandomForestClassifier__max_depth': [8]
}
model_grid9 = GridSearchCV(pipe9, param_grid9, cv = 5, n_jobs = 5)
model_grid9.fit(X_clean1, y_clean1)
model_grid9.best_params_ ##Best hyperparameters
y_predicted = model_grid9.predict(X_test)
y_predicted[1:20]
print(confusion_matrix(y_test, y_predicted))
print(classification_report(y_test, y_predicted))
print(model_grid9.best_score_)
accuracy_score(y_test, model_grid9.predict(X_test))
#Calculate the y_score
y_score = model_grid9.predict_proba(X_test)
##get auc score
roc_auc_score(y_test, y_score, multi_class='ovr')
plt.figure(figsize=(20,20))
cmd = ConfusionMatrixDisplay.from_estimator(model_grid9, X_test, y_test, normalize = 'all', display_labels=['Low', 'Medium', 'High'])
cmd.figure_.savefig('Random_LMH_Fig8F.png',dpi=300)
clf = DecisionTreeClassifier(max_depth = 4, random_state=21)
model = clf.fit(X_clean1, y_clean1)
pred = clf.predict(X_test)
acc = accuracy_score(y_test, pred)
acc
feature_cols = list(X_clean1.columns)
target_names = ['Low', 'Medium', 'High']
feature_cols
X = X_clean1
y = y_clean1
fig = plt.figure(figsize=(45,20))
_ = tree.plot_tree(clf, fontsize=12,
feature_names=feature_cols,
class_names=target_names,
filled=True)
fig.savefig('Tree3ClassNoSelectivityF.png',dpi=300)
DumPipe = Pipeline([
('DummyClassifier', DummyClassifier())
])
Dum_param_grid = {
'DummyClassifier__strategy': ['most_frequent', 'prior', 'stratified', 'uniform'],
}
dum_model = GridSearchCV(DumPipe, Dum_param_grid, cv = 5, n_jobs = 5)
dum_model.fit(X_clean1, y_clean1)
print("Best Hyperparameters:",dum_model.best_params_) ##Best hyperparameters
y_predicted = dum_model.predict(X_test)
y_predicted[1:20]
print(confusion_matrix(y_test, y_predicted))
print(classification_report(y_test, y_predicted))
##print model score
print("Model Score:", dum_model.best_score_)
##Accuracy Score
print("Accuracy Score:",accuracy_score(y_test, dum_model.predict(X_test)))
#Calculate the y_score
y_score = dum_model.predict_proba(X_test)
print("AUC Score", roc_auc_score(y_test, y_score, multi_class='ovr'))
plt.figure(figsize=(20,20))
cmd = ConfusionMatrixDisplay.from_estimator(dum_model, X_test, y_test, normalize = 'all', display_labels=['Low', 'Medium', 'High'])
cmd.figure_.savefig('Dummy_LMH_Fig8F.png',dpi=300)
X_train = train[['est_fte', 'exp_instruc_total_per', 'exp_acad_supp_total_per', 'exp_student_serv_total_per', 'exp_inst_supp_total_per']]
X_test = test[['est_fte', 'exp_instruc_total_per', 'exp_acad_supp_total_per', 'exp_student_serv_total_per', 'exp_inst_supp_total_per']]
y_train = train.Grad_Rates_Quartiles
y_test = test.Grad_Rates_Quartiles
X_train.head(1)
from sklearn.covariance import EllipticEnvelope
envelope1 = EllipticEnvelope(support_fraction=1, contamination=0.03).fit(X_train)
outliers1 = envelope1.predict(X_train)==-1
X_clean1 = X_train[~outliers1]
y_clean1 = y_train[~outliers1]
print(f"Num of outliers = {np.sum(outliers1)}")
pipe10 = Pipeline([
('RandomForestClassifier', RandomForestClassifier())
])
param_grid10 = {
'RandomForestClassifier__n_estimators': [50, 100, 150],
'RandomForestClassifier__criterion': ['gini', 'entropy'],
'RandomForestClassifier__max_depth': [8]
}
model_grid10 = GridSearchCV(pipe10, param_grid10, cv = 5, n_jobs = 5)
model_grid10.fit(X_clean1, y_clean1)
model_grid10.best_params_ ##Best hyperparameters
y_predicted = model_grid10.predict(X_test)
y_predicted[1:20]
print(confusion_matrix(y_test, y_predicted))
print(classification_report(y_test, y_predicted))
print(model_grid10.best_score_)
accuracy_score(y_test, model_grid10.predict(X_test))
#Calculate the y_score
y_score = model_grid10.predict_proba(X_test)
##get auc score
roc_auc_score(y_test, y_score, multi_class='ovr')
plt.figure(figsize=(20,20))
cmd = ConfusionMatrixDisplay.from_estimator(model_grid10, X_test, y_test, normalize = 'all', display_labels=['Q1', 'Q2', 'Q3', 'Q4'])
cmd.figure_.savefig('Random_Quart_Fig9F.png',dpi=300)
clf = DecisionTreeClassifier(max_depth = 4, random_state=21)
model = clf.fit(X_clean1, y_clean1)
pred = clf.predict(X_test)
acc = accuracy_score(y_test, pred)
acc
feature_cols = list(X_clean1.columns)
target_names = ['Q1', 'Q2', 'Q3', 'Q4']
feature_cols
X = X_clean1
y = y_clean1
fig = plt.figure(figsize=(45,20))
_ = tree.plot_tree(clf, fontsize=12,
feature_names=feature_cols,
class_names=target_names,
filled=True)
fig.savefig('Tree4ClassNoSelectivityF.png',dpi=300)
DumPipe = Pipeline([
('DummyClassifier', DummyClassifier())
])
Dum_param_grid = {
'DummyClassifier__strategy': ['most_frequent', 'prior', 'stratified', 'uniform'],
}
dum_model = GridSearchCV(DumPipe, Dum_param_grid, cv = 5, n_jobs = 5)
dum_model.fit(X_clean1, y_clean1)
print("Best Hyperparameters:",dum_model.best_params_) ##Best hyperparameters
y_predicted = dum_model.predict(X_test)
y_predicted[1:20]
print(confusion_matrix(y_test, y_predicted))
print(classification_report(y_test, y_predicted))
##print model score
print("Model Score:", dum_model.best_score_)
##Accuracy Score
print("Accuracy Score:",accuracy_score(y_test, dum_model.predict(X_test)))
#Calculate the y_score
y_score = dum_model.predict_proba(X_test)
print("AUC Score", roc_auc_score(y_test, y_score, multi_class='ovr'))
plt.figure(figsize=(20,20))
cmd = ConfusionMatrixDisplay.from_estimator(dum_model, X_test, y_test, normalize = 'all', display_labels=['Q1', 'Q2', 'Q3', 'Q4'])
cmd.figure_.savefig('Dummy_Quart_Fig9F.png',dpi=300)